In this document we will:
Divide cohort in to high and low risk groups + look at which clinical variables characterize the groups
Do differential gene expression analysis (low risk vs high risk) and plot results. Significance level: p-adj=FDR=0.05
Do a gene set enrichment analysis using fgsea. Significance level BH (Benjamini-Hochberg) adjusted p-value=FDR= 0.05
load("data/tumor_samples_with_subtype.Rdata")
#make clinical onehot
clin_vars <- c("Age", "Neuter_status", "Grade", "Lymphatic_invasion", "ER_status", "HER2_score", "Breed_Maltese", "Malignitet", "subtype_MPAM50", "Breed", "Histology")
data_for_table <- tumor_samples_with_subtype %>%
mutate(subtype_MPAM50 = as.factor(subtype_MPAM50)) %>%
kNN(variable = clin_vars, k=5) %>%
drop_na("Survival_days") %>%
select(c(Sample_ID, all_of(clin_vars), Status_survival, Survival_days)) %>%
# Only keeping variables we will use
as_tibble()
clin_vars = c("Age", "ER_status", "Lymphatic_invasion")
clinical_tbl <- tumor_samples_with_subtype %>%
mutate(subtype_MPAM50 = as.factor(subtype_MPAM50)) %>%
kNN(variable = clin_vars, k=5) %>%
drop_na("Survival_days") %>%
mutate(Age = as.vector(scale(Age))) %>%
select(c(Sample_ID, all_of(clin_vars), Status_survival, Survival_days)) %>%
# Only keeping variables we will use
as_tibble()
# Making treatment contrast groups
clinical_onehot <- clinical_tbl %>%
mutate(ER_positive = ifelse(ER_status == "N", 0, 1)) %>%
mutate(Lymphatic_invasion_present = ifelse(Lymphatic_invasion == "Absent", 0, 1)) %>%
select(-ER_status, -Lymphatic_invasion)
# Fitting the Clinical model on all dogs to get betas to predict from
Cox.clinical <- coxph(Surv(Survival_days, Status_survival) ~ Age + ER_positive + Lymphatic_invasion_present, data = clinical_onehot)
Predicted_risk_multivariate_clinical <- tibble(Sample_ID = clinical_onehot$Sample_ID, Predicted_risk= Cox.clinical$linear.predictors) %>% column_to_rownames("Sample_ID")
#distribution of risk scores
risk_histogram <- ggplot(data = Predicted_risk_multivariate_clinical, aes(Predicted_risk))+
geom_histogram(binwidth = 0.2)+
geom_vline(xintercept = -0.65, color = "red")+
theme_pubclean()+
xlab("Predicted risk score")+
ylab("")+
theme(axis.title=element_text(size=14))
risk_histogram
ggsave(risk_histogram,file="plots/risk_histogram_2025_01_20_one_model.jpeg", height = 4, width = 6.4)
range(Predicted_risk_multivariate_clinical$Predicted_risk)
## [1] -2.739613 1.963318
median(Predicted_risk_multivariate_clinical$Predicted_risk)
## [1] -0.6041138
#Stratifying cohort into low-risk and high-risk groups
high_risk_dogs <- Predicted_risk_multivariate_clinical %>%
filter(Predicted_risk>=median(Predicted_risk_multivariate_clinical$Predicted_risk)) %>%
rownames()
low_risk_dogs <- Predicted_risk_multivariate_clinical %>%
filter(Predicted_risk<median(Predicted_risk_multivariate_clinical$Predicted_risk)) %>%
rownames()
# adding risk-info to clinical onehot and clinical_tbl
clinical_onehot <- clinical_onehot %>%
mutate("Risk_group_high" = ifelse(clinical_onehot$Sample_ID %in% high_risk_dogs, 1, 0))
clinical_tbl <- clinical_tbl %>%
mutate("Risk_group" = ifelse(clinical_onehot$Sample_ID %in% high_risk_dogs, "high", "low"))
data_for_table<- data_for_table %>%
mutate("Risk_group" = ifelse(clinical_onehot$Sample_ID %in% high_risk_dogs, "high", "low"))
# Univariate cox to get Hazard ratio
coxph(Surv(Survival_days, Status_survival) ~ Risk_group_high, data = clinical_onehot)
## Call:
## coxph(formula = Surv(Survival_days, Status_survival) ~ Risk_group_high,
## data = clinical_onehot)
##
## coef exp(coef) se(coef) z p
## Risk_group_high 0.9845 2.6764 0.3511 2.804 0.00505
##
## Likelihood ratio test=8.54 on 1 df, p=0.003472
## n= 146, number of events= 38
# KM plot of the risk groups
fitrisk <- survfit(Surv(clinical_tbl$Survival_days, clinical_tbl$Status_survival)~Risk_group, data=clinical_tbl)
riskKM.plot <- ggsurvplot(fitrisk, data=clinical_tbl,
pval = T,
conf.int=F,
legend="none",
palette= c("deeppink3", "chartreuse3"),
font.title = 25,
font.x = 18,
font.y = 18,
xlab = "Days")
riskKM.plot$plot <- riskKM.plot$plot +
ggplot2::annotate("text",
x = 120, y = 0.1, # x and y coordinates of the text
label = "HR = 2.68", size = 5)
ggsave(file = "plots/riskKM.plot.one_model.png", plot= print(riskKM.plot$plot), height = 4.1, width = 5.7)
tablerisk <- table_descriptive <- tableby(Risk_group ~ Age + Breed_Maltese + Neuter_status + subtype_MPAM50 + Lymphatic_invasion + ER_status + HER2_score +Grade, data = data_for_table)
# labels(table_descriptive) <- c(Neuter_status = "Neuter status", Breed_Maltese = "Breed group", Lymphatic_invasion = "Lymphatic invasion", ER_status = "ER status", HER2_score = "HER2 score", subtype_MPAM50 = "MPAM50 Subtype") # Does not work
summary(tablerisk)
| high (N=74) | low (N=72) | Total (N=146) | p value | |
|---|---|---|---|---|
| Age | < 0.001 | |||
| Â Â Â Mean (SD) | 13.851 (1.576) | 9.847 (2.324) | 11.877 (2.816) | |
| Â Â Â Range | 10.000 - 19.000 | 2.000 - 12.000 | 2.000 - 19.000 | |
| Breed_Maltese | 0.229 | |||
| Â Â Â Maltese | 18 (24.3%) | 24 (33.3%) | 42 (28.8%) | |
| Â Â Â Non-Maltese | 56 (75.7%) | 48 (66.7%) | 104 (71.2%) | |
| Neuter_status | 0.006 | |||
| Â Â Â Intact | 43 (58.1%) | 57 (79.2%) | 100 (68.5%) | |
| Â Â Â Neutered | 31 (41.9%) | 15 (20.8%) | 46 (31.5%) | |
| subtype_MPAM50 | 0.002 | |||
| Â Â Â Basal | 40 (54.1%) | 21 (29.2%) | 61 (41.8%) | |
| Â Â Â LumA | 34 (45.9%) | 51 (70.8%) | 85 (58.2%) | |
| Lymphatic_invasion | < 0.001 | |||
| Â Â Â Absent | 59 (79.7%) | 72 (100.0%) | 131 (89.7%) | |
| Â Â Â Present | 15 (20.3%) | 0 (0.0%) | 15 (10.3%) | |
| ER_status | < 0.001 | |||
| Â Â Â N | 21 (28.4%) | 2 (2.8%) | 23 (15.8%) | |
| Â Â Â P | 53 (71.6%) | 70 (97.2%) | 123 (84.2%) | |
| HER2_score | 0.094 | |||
| Â Â Â 0 | 8 (10.8%) | 3 (4.2%) | 11 (7.5%) | |
| Â Â Â 1 | 16 (21.6%) | 10 (13.9%) | 26 (17.8%) | |
| Â Â Â 2 | 30 (40.5%) | 43 (59.7%) | 73 (50.0%) | |
| Â Â Â 3 | 20 (27.0%) | 16 (22.2%) | 36 (24.7%) | |
| Grade | < 0.001 | |||
| Â Â Â Benign | 7 (9.5%) | 26 (36.1%) | 33 (22.6%) | |
| Â Â Â 1 | 31 (41.9%) | 34 (47.2%) | 65 (44.5%) | |
| Â Â Â 2 | 12 (16.2%) | 10 (13.9%) | 22 (15.1%) | |
| Â Â Â 3 | 24 (32.4%) | 2 (2.8%) | 26 (17.8%) |
We will follow the workflow described in the DESeq2 vignette. http://bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#why-after-vst-are-there-still-batches-in-the-pca-plot . Low-risk is set to reference level.
Below is a description of what the DESeq() function
does
load("data/counts_tumors.Rdata")
counts_tumors <- counts_tumors %>%
select(clinical_tbl$Sample_ID)
# Formatting gene names to syntactically valid names
rownames(counts_tumors) <- make.names(rownames(counts_tumors))
# Normalizing count data with DESeq2.
# Used clinical_tbl(which has the original variables) and not the clinical_onehot.
dds_tumor <- DESeqDataSetFromMatrix(countData = counts_tumors,
colData = clinical_tbl,
design = ~ Risk_group)
dds_tumor$Risk_group <- relevel(dds_tumor$Risk_group, ref = "low") #Setting low-riak as the reference level
### Pre-filtering: not necessary but recommended since it will decrease run time and memory size dds_tumor ------
keep <- rowSums(counts(dds_tumor)) >= 10
dds_tumor <- dds_tumor[keep,]
dds_tumor <- DESeq(dds_tumor)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 2013 genes
## -- DESeq argument 'minReplicatesForReplace' = 7
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
res <- results(dds_tumor)
datatable(as.data.frame(res), options = list(pageLength = 12, scrollX = T), caption = htmltools::tags$caption(style = 'caption-side: top; text-align: left;', 'Differential gene expression list'))
res_save <- res %>% as.data.frame() %>% rownames_to_column(var = "Gene_name")
openxlsx::write.xlsx(res_save, 'plots/results_DGE_one_model.xlsx')
With FDR= 0.05, there are 2073 differentially expressed genes
res05 <- results(dds_tumor, alpha=0.05)
summary(res05)
##
## out of 16995 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 1493, 8.8%
## LFC < 0 (down) : 580, 3.4%
## outliers [1] : 0, 0%
## low counts [2] : 337, 2%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
sum(res05$padj < 0.05, na.rm=TRUE) #number of genes with padj below 0.05
## [1] 2073
diff_expr_results <- res %>% as.data.frame() %>% rownames_to_column(.,var = "GeneName")
sign <- res$padj < 0.05 & diff_expr_results$log2FoldChange < -1 | diff_expr_results$padj < 0.05 & diff_expr_results$log2FoldChange > 1
not_sign <- !sign
sign_low_risk <- diff_expr_results$padj < 0.05 & diff_expr_results$log2FoldChange < -1
sign_high_risk <- diff_expr_results$padj < 0.05 & diff_expr_results$log2FoldChange > 1
volcano_tumor <- ggplot(diff_expr_results, aes(log2FoldChange, -log10(padj)))+
geom_point(data = diff_expr_results[not_sign,],alpha = 0.5, size = 1)+
geom_point(data = diff_expr_results[sign_high_risk,], fill = "deeppink3",color = "black", size = 3, alpha = 0.7, shape = 21)+
geom_point(data = diff_expr_results[sign_low_risk,], fill = "chartreuse3",color = "black", size = 3, alpha = 0.7, shape = 21)+
geom_text_repel(label = diff_expr_results$GeneName[sign], data = diff_expr_results[sign,], max.overlaps = 15)+
geom_hline(yintercept = -log10(0.05), color = "lightgrey", linetype = 2)+
geom_vline(xintercept = c(-1,1), color = "lightgrey", linetype = 2)+
labs(x = "Log 2 fold change", y = "-log10(FDR)")+
#lims(x = c(-2.5,2.5))+
#ggtitle("DGE low-risk vs high-risk")+
theme_classic2(base_size = 16)
volcano_tumor
ggsave(file = "plots/Volcano.plot.one_model.png", plot= volcano_tumor, height = 7, width = 8)
I have used the approach for fgsea described on this website: https://stephenturner.github.io/deseq-to-fgsea/. I used the hallmark gene set for both analyses (as this set was recommended to start with) Codes for the bottom 4 plots are adapted from the fgsea vignette: https://bioconductor.org/packages/devel/bioc/vignettes/fgsea/inst/doc/fgsea-tutorial.html
# Extracting two rows from the ranked gene list from DESeq2 (res), as we only need the gene name and test statistic.
res2 <- res %>%
as_tibble(rownames = "gene_symbol") %>%
dplyr::select(gene_symbol, stat)
#Loading hallmark pathways
pathways.hallmark <- gmtPathways("data/msigdb/msigdb_files/h.all.v2022.1.Hs.symbols.gmt")
## Run fgsea
ranks <- deframe(res2)
fgseaRes <- fgsea(pathways=pathways.hallmark, stats=ranks, nPermSimple=10000)
## Tidying: arrangring by descending NES
fgseaResTidy <- fgseaRes %>%
as_tibble() %>%
arrange(desc(NES))
The table below is ordered by descending normalized enrichment scores (NES).
#Making the table
fgseaResTidy %>%
dplyr::select(-leadingEdge, -ES) %>%
# arrange(padj) %>% #include this to arrange by padj value
datatable(. , options = list(pageLength = 12, scrollX = T), caption = htmltools::tags$caption(style = 'caption-side: top; text-align: left;', 'Top enriched pathways'))
openxlsx::write.xlsx(fgseaResTidy, 'plots/results_GSEA.xlsx')
Plot the normalized enrichment scores. The color of the bar indicates whether or not the pathway was significantly enriched:
ggplot(fgseaResTidy, aes(reorder(pathway, NES), NES)) +
geom_col(aes(fill=padj<0.05)) +
coord_flip() +
labs(x="Pathway", y="Normalized Enrichment Score",
title="GSEA Hallmark Pathways") +
theme_minimal()+
theme(axis.text.y = element_text(size = 6))
Plot of the significant pathways. Color indicates if the pathways are enriched in the low-risk group (green) or in the high-risk group (pink)
significant_pathways <- fgseaResTidy %>%
filter(padj<0.05) %>%
mutate(pathway = str_replace_all(pathway, "_", " ")) %>%
mutate(pathway = str_remove_all(pathway, "HALLMARK"))
color = ifelse(significant_pathways$NES > 0, "deeppink3", "chartreuse3")
GSEA_plot <- ggplot(significant_pathways, aes(x = reorder(pathway, NES), y = NES)) +
geom_bar(stat = "identity", fill = color, linewidth = 1)+ # evt: color = "black",
coord_flip()+
labs(x="Pathway", y="Normalized Enrichment Score") +
theme_bw()+
theme(axis.text = element_text(size = 12, face = "bold"), axis.title = element_text(size = 14, face = "bold"))
GSEA_plot
ggsave(file = "plots/GSEA.plot.one_model.png", plot= GSEA_plot, height = 13, width = 10)
From the graph above, we can easily identify the top enriched pathway in high-risk tumors: Hallmark E2F targets. E2F targets include genes/proteins involved in initiation of replication, nucleotide synthesis and DNA synthesis. The enrichment plot for this pathway is plotted below:
plotEnrichment(pathways.hallmark[["HALLMARK_E2F_TARGETS"]],
ranks) + labs(title="E2F targets pathway")
The top downregulated pathway in high-risk tumors is the myogenesis pathway.
plotEnrichment(pathways.hallmark[["HALLMARK_MYOGENESIS"]],
ranks) + labs(title="Hallmark myogenesis")
collapsedPathways <- collapsePathways(fgseaRes[order(pval)][padj < 0.01],
pathways.hallmark, ranks)
mainPathways <- fgseaRes[pathway %in% collapsedPathways$mainPathways][
order(-NES), pathway]
plotGseaTable(pathways.hallmark[mainPathways], ranks, fgseaRes,
gseaParam = 0.5, pathwayLabelStyle = list(size=8))